#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test hdf5 acct_gather_profile (--profile=task)
############################################################################
#  Copyright (C) 2013 Bull S. A. S.
#		Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois.
#
#  Written by Rod Schultz <rod.schultz@bull.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Per-test bookkeeping: overall status, the job under test, and the names
# of the artifacts (load program and extracted output) this test creates.
set exit_code   0
set job_id      0
set file_prog   "test${test_id}.prog"
set file_out    "test${test_id}.output"

# This test does not support front-end configurations; skip when one is set.
if {"MISSING" ne [get_config_param "FrontendName"]} {
	skip "This test is incompatible with front-end systems"
}

# Check if acct_gather_profile/hdf5 is installed by scanning the output of
# "scontrol show config". The (long) output is hidden from the test log
# while scanning and logging is re-enabled afterwards.
set profile 0
log_user 0
spawn $scontrol show config
expect {
	-re "acct_gather_profile/hdf5" {
		set profile 1
		# Keep consuming output until eof so that scontrol is reaped by
		# the wait below instead of being left behind unreaped.
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$profile == 0} {
	skip "acct_gather_profile/hdf5 not installed on this system"
}
log_info "Acct_gather_profile/hdf5 plugin installed"
log_debug "Note: this test takes 3 minutes to run"

# Sampling frequency used for the profiled job below. The test still runs
# with a value under 30, but only after warning that results are unreliable.
set task_freq [get_job_acct_freq]
if {30 > $task_freq} {
	log_warn "jobacct_gather_freq < 30 ($task_freq), results are unreliable"
}

#
# Compile the helper program that puts a known load on the system.
# Any stale binary is removed first; the fresh one is made private to
# the owner (mode 700).
#
exec $bin_rm -f $file_prog
exec $bin_cc "-I$build_dir" "${file_prog}.c" -lm -o $file_prog
exec $bin_chmod 700 $file_prog

# Give srun enough time to be scheduled and to run the load program.
set timeout [expr {$max_job_delay + 200}]

# Run the load program under srun with task profiling enabled and capture
# the job id it reports; the id is needed by the sh5util steps that follow.
set srun_pid [spawn $srun --acctg-freq=$task_freq --profile=task -t5 ./$file_prog]
expect {
	-re "SLURM_JobId=($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "error: PROFILE: Impossible to create the table" {
		exec rm -f $file_prog
		skip "This error is cause by a bug in HDF5 1.10.0, and is fixed in 1.10.1.  Please upgrade to run this test"
	}
	-re "error:" {
		# Stop srun before failing, as the timeout branch does
		slow_kill $srun_pid
		fail "Something happened on start of job"
	}
	timeout {
		slow_kill $srun_pid
		fail "Srun not responding"
	}
	eof {
		wait
	}
}

# Without a job id there is no profile to merge or extract; fail here
# rather than letting sh5util run against job 0.
if {$job_id == 0} {
	fail "Job ID was not reported by srun"
}

# Run the sh5util merge step for the job. A short timeout is used from
# here on. An "empty output" message or an unresponsive sh5util fails
# the test after the process has been killed.
set timeout 10
set merge_pid [spawn $sh5util -j $job_id]
expect {
	"Output file generated is empty" {
		slow_kill $merge_pid
		fail "Sh5util merge didn't make anything"
	}
	eof {
		wait
	}
	timeout {
		slow_kill $merge_pid
		fail "Sh5util merge not responding"
	}
}

# Name of the per-job HDF5 file; it is removed or preserved at the end of
# the test depending on the outcome.
set hdf5_file "job_$job_id.h5"

# Extract the Node:TimeSeries "Tasks" series for the job into $file_out,
# which is parsed as comma-separated values afterwards.
set srun_pid [spawn $sh5util -j $job_id -E -l Node:TimeSeries -s Tasks -o $file_out]
expect {
	"Output file generated is empty" {
		slow_kill $srun_pid
		fail "Sh5util extract didn't make anything"
	}
	timeout {
		# Kill sh5util before failing, as every other branch does;
		# calling fail first would leave the process running.
		slow_kill $srun_pid
		fail "Sh5util extract not responding"
	}
	eof {
		wait
	}
}

# Validate the extracted CSV in $file_out.
#   line 1      header; locates the ElapsedTime, CPUUtilization and ReadMB
#               columns (a missing column sets exit_code and stops parsing)
#   line 2      first sample; only establishes the elapsed-time baseline
#   lines 3..n  checked for poll interval, CPU utilization and read volume;
#               each out-of-range value is logged and counted in nerr
# lno is left holding the number of lines read so that an empty file can
# be detected afterwards.
set nerr 0
set lno 0
set last_et 0
set et_col -1
set cpu_util_col -1
set read_disk_col -1

# open raises a Tcl error on failure, so catch it and report the problem
# through the normal exit_code path instead of aborting with a stack trace.
if {[catch {open $file_out "r"} fd]} {
	log_error "Unable to open $file_out: $fd"
	set exit_code 1
} else {
	while {[gets $fd line] != -1} {
		incr lno

		set tokens [split $line ","]

		if {$lno == 1} {
			set et_col [lsearch $tokens "ElapsedTime"]
			set cpu_util_col [lsearch $tokens "CPUUtilization"]
			set read_disk_col [lsearch $tokens "ReadMB"]

			if {$et_col == -1} {
				log_error "No ElapsedTime column found"
				set exit_code 1
			}
			if {$cpu_util_col == -1} {
				log_error "No CPUUtilization column found"
				set exit_code 1
			}
			if {$read_disk_col == -1} {
				log_error "No ReadMB column found"
				set exit_code 1
			}

			if {$exit_code} {
				break
			}
			continue
		}

		# Elapsed time is cumulative; the difference from the previous
		# sample is the length of this poll interval.
		set et [lindex $tokens $et_col]
		set cur_et [expr {$et - $last_et}]
		set last_et $et

		if {$lno == 2} {
			continue
		}

		if {$cur_et < $task_freq} {
			log_warn "Poll interval was only $cur_et instead of expected $task_freq on line $lno"
			incr nerr
		}

		set cputil [lindex $tokens $cpu_util_col]
		# The range on cpu utilization is pretty wide.
		# Linux accounting resolution is only to one second, so in a
		# typical 30 second interval an extra second is 3%. The burn
		# loop consumes a bit more than asked for. There is additional
		# time managing the I/O portion. Slurm and Linux also consume
		# some cpu.
		if {$cputil < 38.0 || $cputil > 47.0} {
			log_warn "CPU Busy $cputil not near 40% on line $lno"
			incr nerr
		}

		# Expected read volume is 10 MB per elapsed second, +/- 2.5%
		set rdmb [lindex $tokens $read_disk_col]
		set low_rd [expr {0.975 * 10 * $cur_et}]
		set hi_rd  [expr {1.025 * 10 * $cur_et}]

		if {$rdmb < $low_rd || $rdmb > $hi_rd} {
			log_warn "Read Megabytes $rdmb not within expected range $low_rd - $hi_rd on line $lno"
			incr nerr
		}
	}
	close $fd
}

# Final verdict. No parsed lines, or more than three out-of-range samples,
# count as failure.
if {$lno == 0} {
	log_error "$file_out is empty"
	set exit_code 1
}

if {$nerr > 3} {
	log_error "$nerr is too many values out of range"
	set exit_code 1
}

if {$exit_code == 0} {
	# Success: remove everything the test created
	exec rm -f $file_prog $file_out $hdf5_file
} else {
	# Failure: keep the HDF5 file under a test-specific name. A missing
	# file or a failed move must not raise a Tcl error that would hide
	# the failure report below.
	if {[file exists $hdf5_file]} {
		if {[catch {exec mv $hdf5_file test$test_id.hdf5} mv_err]} {
			log_warn "Unable to preserve $hdf5_file: $mv_err"
		}
	}
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}
